{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# Evidently Metrics "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "try:\n",
    "    import evidently\n",
    "except:\n",
    "    !pip install git+https://github.com/evidentlyai/evidently.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn import datasets\n",
    "from sklearn import ensemble\n",
    "from sklearn import model_selection\n",
    "\n",
    "from evidently import ColumnMapping\n",
    "from evidently.options import ColorOptions\n",
    "from evidently.report import Report\n",
    "\n",
    "from evidently.metrics import ColumnDriftMetric\n",
    "from evidently.metrics import DataDriftTable\n",
    "from evidently.metrics import DatasetDriftMetric\n",
    "from evidently.metrics import ColumnCategoryMetric\n",
    "from evidently.metrics import ColumnDistributionMetric\n",
    "from evidently.metrics import ColumnValuePlot\n",
    "from evidently.metrics import ColumnQuantileMetric\n",
    "from evidently.metrics import ColumnCorrelationsMetric\n",
    "from evidently.metrics import ColumnValueListMetric\n",
    "from evidently.metrics import ColumnValueRangeMetric\n",
    "from evidently.metrics import DatasetCorrelationsMetric\n",
    "from evidently.metrics import ColumnRegExpMetric\n",
    "from evidently.metrics import ColumnSummaryMetric\n",
    "from evidently.metrics import ColumnMissingValuesMetric\n",
    "from evidently.metrics import DatasetSummaryMetric\n",
    "from evidently.metrics import DatasetMissingValuesMetric\n",
    "from evidently.metrics import ConflictTargetMetric\n",
    "from evidently.metrics import ConflictPredictionMetric\n",
    "from evidently.metrics import ClassificationQualityMetric\n",
    "from evidently.metrics import ClassificationClassBalance\n",
    "from evidently.metrics import ClassificationConfusionMatrix\n",
    "from evidently.metrics import ClassificationQualityByClass\n",
    "from evidently.metrics import ClassificationClassSeparationPlot\n",
    "from evidently.metrics import ClassificationProbDistribution\n",
    "from evidently.metrics import ClassificationRocCurve\n",
    "from evidently.metrics import ClassificationPRCurve\n",
    "from evidently.metrics import ClassificationPRTable\n",
    "from evidently.metrics import ClassificationLiftCurve\n",
    "from evidently.metrics import ClassificationLiftTable\n",
    "from evidently.metrics import ClassificationQualityByFeatureTable\n",
    "from evidently.metrics import ClassificationDummyMetric\n",
    "from evidently.metrics import RegressionQualityMetric\n",
    "from evidently.metrics import RegressionPredictedVsActualScatter\n",
    "from evidently.metrics import RegressionPredictedVsActualPlot\n",
    "from evidently.metrics import RegressionErrorPlot\n",
    "from evidently.metrics import RegressionAbsPercentageErrorPlot\n",
    "from evidently.metrics import RegressionErrorDistribution\n",
    "from evidently.metrics import RegressionErrorNormality\n",
    "from evidently.metrics import RegressionTopErrorMetric\n",
    "from evidently.metrics import RegressionErrorBiasTable\n",
    "from evidently.metrics import RegressionDummyMetric\n",
    "#from evidently.metrics import EmbeddingsDriftMetric"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## Prepare a Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#Dataset for Data Quality and Integrity\n",
    "adult_data = datasets.fetch_openml(name='adult', version=2, as_frame=True)\n",
    "adult = adult_data.frame\n",
    "\n",
    "adult_ref = adult[~adult.education.isin(['Some-college', 'HS-grad', 'Bachelors'])]\n",
    "adult_cur = adult[adult.education.isin(['Some-college', 'HS-grad', 'Bachelors'])]\n",
    "\n",
    "adult_cur.iloc[:2000, 3:5] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#Dataset for binary label and probabilistic classifcation\n",
    "bcancer_data = datasets.load_breast_cancer(as_frame=True)\n",
    "bcancer = bcancer_data.frame\n",
    "\n",
    "bcancer_ref = bcancer.sample(n=300, replace=False)\n",
    "bcancer_cur = bcancer.sample(n=200, replace=False)\n",
    "\n",
    "bcancer_label_ref = bcancer_ref.copy(deep=True)\n",
    "bcancer_label_cur = bcancer_cur.copy(deep=True)\n",
    "\n",
    "model = ensemble.RandomForestClassifier(random_state=1, n_estimators=10)\n",
    "model.fit(bcancer_ref[bcancer_data.feature_names.tolist()], bcancer_ref.target)\n",
    "\n",
    "bcancer_ref['prediction'] = model.predict_proba(bcancer_ref[bcancer_data.feature_names.tolist()])[:, 1]\n",
    "bcancer_cur['prediction'] = model.predict_proba(bcancer_cur[bcancer_data.feature_names.tolist()])[:, 1]\n",
    "\n",
    "bcancer_label_ref['prediction'] = model.predict(bcancer_label_ref[bcancer_data.feature_names.tolist()])\n",
    "bcancer_label_cur['prediction'] = model.predict(bcancer_label_cur[bcancer_data.feature_names.tolist()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#Dataset for regression\n",
    "housing_data = datasets.fetch_california_housing(as_frame=True)\n",
    "housing = housing_data.frame\n",
    "\n",
    "housing.rename(columns={'MedHouseVal': 'target'}, inplace=True)\n",
    "housing['prediction'] = housing_data['target'].values + np.random.normal(0, 3, housing.shape[0])\n",
    "\n",
    "housing_ref = housing.sample(n=5000, replace=False)\n",
    "housing_cur = housing.sample(n=5000, replace=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## How to run Reports?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### Data Drift Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#dataset-level metrics\n",
    "data_drift_dataset_report = Report(metrics=[\n",
    "    DataDriftTable(num_stattest='kl_div', cat_stattest='psi'),    \n",
    "])\n",
    "\n",
    "data_drift_dataset_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_drift_dataset_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#report in a JSON format\n",
    "data_drift_dataset_report.json()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#report as a python object\n",
    "data_drift_dataset_report.as_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#column-level metrics\n",
    "data_drift_column_report = Report(metrics=[\n",
    "    ColumnDriftMetric('age'),\n",
    "    ColumnValuePlot('age'),  \n",
    "])\n",
    "\n",
    "data_drift_column_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_drift_column_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### Data Quality Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#dataset-level metrics\n",
    "data_quality_dataset_report = Report(metrics=[\n",
    "    DatasetCorrelationsMetric(),\n",
    "    \n",
    "])\n",
    "\n",
    "data_quality_dataset_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_quality_dataset_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#column-level metrics\n",
    "data_quality_column_report = Report(metrics=[\n",
    "    ColumnDistributionMetric(column_name=\"education\"), \n",
    "    ColumnQuantileMetric(column_name=\"education-num\", quantile=0.75), \n",
    "    ColumnCorrelationsMetric(column_name=\"education\"),\n",
    "    ColumnValueListMetric(column_name=\"relationship\", values=[\"Husband\", \"Unmarried\"]), \n",
    "    ColumnValueRangeMetric(column_name=\"age\", left=10, right=20),\n",
    "    ColumnCategoryMetric(column_name='education', category='Some-college'),\n",
    "    \n",
    "])\n",
    "\n",
    "data_quality_column_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_quality_column_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### Data Integrity Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#dataset-level metrics\n",
    "data_integrity_dataset_report = Report(metrics=[\n",
    "    DatasetSummaryMetric(),\n",
    "    DatasetMissingValuesMetric()\n",
    "    \n",
    "])\n",
    "\n",
    "data_integrity_dataset_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_integrity_dataset_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#column-level metrics\n",
    "data_integrity_column_report = Report(metrics=[\n",
    "    ColumnRegExpMetric(column_name=\"relationship\", reg_exp=r\".*child.*\"),\n",
    "    ColumnSummaryMetric(column_name=\"age\"),\n",
    "    ColumnMissingValuesMetric(column_name=\"education\"),\n",
    "\n",
    "    \n",
    "])\n",
    "\n",
    "data_integrity_column_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_integrity_column_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### Classification Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#label binary classification\n",
    "classification_report = Report(metrics=[\n",
    "    ClassificationQualityMetric(),\n",
    "    ClassificationClassBalance(),\n",
    "    ConflictTargetMetric(),\n",
    "    ConflictPredictionMetric(),\n",
    "    ClassificationConfusionMatrix(),\n",
    "    ClassificationQualityByClass(),\n",
    "    ClassificationDummyMetric(),\n",
    "    ClassificationQualityByFeatureTable(columns=['mean area', 'fractal dimension error']),\n",
    "])\n",
    "\n",
    "classification_report.run(reference_data=bcancer_label_ref, current_data=bcancer_label_cur)\n",
    "classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#probabilistic binary classification\n",
    "classification_report = Report(metrics=[\n",
    "    ClassificationQualityMetric(),\n",
    "    ClassificationClassBalance(),\n",
    "    ConflictTargetMetric(),\n",
    "    ConflictPredictionMetric(),\n",
    "    ClassificationConfusionMatrix(),\n",
    "    ClassificationQualityByClass(),\n",
    "    ClassificationClassSeparationPlot(),\n",
    "    ClassificationProbDistribution(),\n",
    "    ClassificationRocCurve(),\n",
    "    ClassificationPRCurve(),\n",
    "    ClassificationPRTable(),\n",
    "    ClassificationLiftCurve(),\n",
    "    ClassificationLiftTable(),\n",
    "    ClassificationQualityByFeatureTable(columns=['mean area', 'fractal dimension error']),\n",
    "])\n",
    "\n",
    "classification_report.run(reference_data=bcancer_ref, current_data=bcancer_cur)\n",
    "classification_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### Regression Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "regression_report = Report(metrics=[\n",
    "    RegressionQualityMetric(),\n",
    "    RegressionPredictedVsActualScatter(),\n",
    "    RegressionPredictedVsActualPlot(),\n",
    "    RegressionErrorPlot(),\n",
    "    RegressionAbsPercentageErrorPlot(),\n",
    "    RegressionErrorDistribution(),\n",
    "    RegressionErrorNormality(),\n",
    "    RegressionTopErrorMetric(),\n",
    "    RegressionErrorBiasTable(columns=['MedInc', 'AveRooms']),\n",
    "    RegressionDummyMetric(),\n",
    "    ConflictTargetMetric(),\n",
    "    ConflictPredictionMetric(),\n",
    "    \n",
    "])\n",
    "\n",
    "regression_report.run(reference_data=housing_ref, current_data=housing_cur)\n",
    "regression_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### How to set metric parameters?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#simple metric parameters\n",
    "data_integrity_column_report = Report(metrics=[\n",
    "    ColumnRegExpMetric(column_name=\"education\", reg_exp=r\".*-.*\", top=5),\n",
    "    ColumnRegExpMetric(column_name=\"relationship\", reg_exp=r\".*child.*\")\n",
    "])\n",
    "\n",
    "data_integrity_column_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_integrity_column_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#options\n",
    "color_scheme = ColorOptions(\n",
    "    primary_color=\"#5a86ad\",\n",
    "    fill_color=\"#fff4f2\",\n",
    "    zero_line_color=\"#016795\",\n",
    "    current_data_color= \"#c292a1\" ,\n",
    "    reference_data_color=\"#017b92\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data_drift_column_report = Report(metrics=[\n",
    "    ColumnDriftMetric('age'),\n",
    "    ColumnDriftMetric('age', stattest='psi'),\n",
    "],\n",
    "                                  options=[color_scheme]\n",
    ")\n",
    "\n",
    "data_drift_column_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_drift_column_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to use Metrics with raw data?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_drift_column_report = Report(metrics=[\n",
    "    ColumnDriftMetric('age'),\n",
    "    ColumnValuePlot('age'),  \n",
    "    \n",
    "],\n",
    "    options={\"render\": {\"raw_data\": True}}\n",
    "   )\n",
    "\n",
    "data_drift_column_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_drift_column_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using metrics with Raw data (for a single metric)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_drift_column_report = Report(metrics=[\n",
    "    ColumnDriftMetric('age', options={\"render\": {\"raw_data\": True}}),\n",
    "    ColumnValuePlot('age'),  \n",
    "    \n",
    "]\n",
    "   )\n",
    "\n",
    "data_drift_column_report.run(reference_data=adult_ref, current_data=adult_cur)\n",
    "data_drift_column_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Support Evidently\n",
    "Did you find the example useful? Star Evidently on GitHub to contribute back! This helps us continue creating free open-source tools for the community. https://github.com/evidentlyai/evidently"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
